This week is Modes Less Traveled - Bicycling and Walking to Work in the United States: 2008-2012 from the ACS.
The commute.csv dataset on Github was tidied from 6 raw excel files. The code and final dataset was already provided.
commute_mode <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-11-05/commute.csv")
acs_data <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-11-05/table_3.csv")
# previously geocoded. code below, but not run in RMD
#commute_mode_geo <- readxl::read_excel("2019w45/TT_2019w45_geo.xlsx")
commute_mode_geo <- readxl::read_excel("TT_2019w45_geo.xlsx")
# cleaning code from:
# https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-11-05
# Load Packages -----------------------------------------------------------
library(tidyverse)
library(readxl)
library(here)
library(glue)
library(janitor)
# Read in Data ------------------------------------------------------------
table_num <- 1:6
# Generic read function for this dataset
supp_read <- function(number, ...){
read_excel(here("2019", "2019-11-05", glue::glue("supplemental-table{number}.xlsx")), ...)
}
# 3 datasets for bikes, each of which has a corresponding City Size
small_bike <- supp_read(1, skip = 5) %>%
clean_names() %>%
mutate(city_size = "Small",
percentage_of_workers = as.numeric(percentage_of_workers),
margin_of_error_2 = as.numeric(margin_of_error_2))
medium_bike <- supp_read(2, skip = 5) %>%
clean_names() %>%
mutate(city_size = "Medium")
large_bike <- supp_read(3, skip = 5) %>%
clean_names() %>%
mutate(city_size = "Large")
# Combine datasets
full_bike <- bind_rows(small_bike, medium_bike, large_bike) %>%
set_names(nm = c("city", "n", "percent", "moe", "city_size")) %>%
mutate(mode = "Bike")
# 3 datasets for walking, each of which has a corresponding City Size
small_walk <- supp_read(4, skip = 5) %>%
clean_names() %>%
mutate(city_size = "Small")
medium_walk <- supp_read(5, skip = 5) %>%
clean_names() %>%
mutate(city_size = "Medium")
large_walk <- supp_read(6, skip = 5) %>%
clean_names() %>%
mutate(city_size = "Large")
# Combine datasets
full_walk <- bind_rows(small_walk, medium_walk, large_walk) %>%
set_names(nm = c("city", "n", "percent", "moe", "city_size")) %>%
mutate(mode = "Walk")
# Built in state-level datasets
state_df <- tibble(
state = state.name,
state_abb = state.abb,
state_region = as.character(state.region)
)
# Combine bike and walk data in tidy setup
full_commute <-
bind_rows(full_bike, full_walk) %>%
filter(!is.na(n),
# There are some government-related areas that don't align with cities
!str_detect(tolower(city), "government|goverment")) %>%
separate(city, into = c("city", "state"), sep = ", ") %>%
select(city, state, city_size, mode, everything()) %>%
left_join(state_df, by = c("state"))
full_commute %>%
write_csv(here("2019", "2019-11-05", "commute.csv"))
Code for tidying summary tables by demographics was also provided as below.
# ACS Data ----------------------------------------------------------------
# acs_data <- read_csv(here("2019", "2019-11-05", "table_3.csv"))
age_data <- acs_data %>%
slice(1:6)
#knitr::kable(age_data)
gender_data <- acs_data %>%
slice(9:10) %>%
rename("gender" = age)
#knitr::kable()
race_data <- acs_data %>%
slice(13:18) %>%
rename("race" = age) %>%
knitr::kable()
children_data <- acs_data %>%
slice(20:24) %>%
rename("children" = age) %>%
knitr::kable()
income_data <- acs_data %>%
slice(27:36) %>%
rename("income" = age) %>%
knitr::kable()
education_data <- acs_data %>%
slice(39:43) %>%
rename("education" = age) %>%
knitr::kable()
First, look at data using head:
head(commute_mode)
Still some small things to tidy:
commute_mode %>%
filter(is.na(state_region))
# DC is south, CA is West
commute_mode$state_abb[which(commute_mode$city == "El Paso de Robles (Paso Robles) city")] <- "CA"
commute_mode$state_region[which(commute_mode$city == "El Paso de Robles (Paso Robles) city")] <- "West"
commute_mode$state_abb[which(commute_mode$city == "West Springfield Town city")] <- "MA"
commute_mode$state_region[which(commute_mode$city == "West Springfield Town city")] <- "Northeast"
commute_mode$state_abb[which(commute_mode$city == "Washington city")] <- "DC"
commute_mode$state_region[which(commute_mode$city == "Washington city")] <- "South"
USA <- usmap::us_map()
# check if state names and things are spelled correctly
commute_mode %>% anti_join(USA, by = c("state" = "full"))
commute_mode %>% anti_join(USA, by = c("state_abb" = "abbr"))
Next, I clean the added words “village”, “town”, “city”, etc from the end of each city name, create a concatenation of city and state, and geocode using google maps. I saved this as a CSV to save time when re-running the file. The code is commented out, but shown for completeness.
# remove last word in each city name
commute_mode$city2 <- gsub("\\s*\\w*$", "", commute_mode$city)
# city state together for geocoding
commute_mode$city_state <- paste(commute_mode$city2, commute_mode$state_abb)
# commute_mode_geo <- mutate_geocode(commute_mode, city_state)
#
# openxlsx::write.xlsx(commute_mode_geo, "TT_2019w45_geo.xlsx")
I used choroplethrMaps to get the USA state-level data, which I joined to state averages of walk and bike travel.
# filter to map each mode
# color each state by avg
commute_mode_geo_bike <- commute_mode_geo %>%
filter(mode == "Bike") %>%
mutate(state = tolower(state))
state_bike <- commute_mode_geo_bike %>%
group_by(state) %>%
mutate(state_avg = mean(percent)) %>%
select(state, state_avg)
commute_mode_geo_walk <- commute_mode_geo %>%
filter(mode == "Walk") %>%
mutate(state = tolower(state))
state_walk <- commute_mode_geo_walk %>%
group_by(state) %>%
mutate(state_avg = mean(percent)) %>%
select(state, state_avg)
# polygon for states
data(state.map)
US_map_bike <- left_join(state.map, state_bike, by = c("region" = "state")) %>%
distinct_all()
US_map_walk <- left_join(state.map, state_walk, by = c("region" = "state")) %>%
distinct_all()
Below are box plots of mode of travel by city size and region. There are more walkers, more variation in small cities, and it looks like the northeast may have more walkers than other regions. The Northeast
ggplot(data = commute_mode,
aes(x = city_size, y= percent, fill= state_region)) +
geom_boxplot() +
facet_wrap(. ~ mode, scales = "free") +
labs(title='Travel to work by city size and region',
x='City Size',
y='Percent of travel',
fill = '') +
theme(legend.position = "bottom")
Now I map the data. Each point represents a city in the dataset. The points and states are colored according to the percent of mode of travel, by city or averaged over the whole state. I also piped to plotly so the maps are interactive.
US_bike <- US_map_bike %>%
ggplot(aes(long, lat,
group = group, fill = state_avg)) +
geom_polygon(color = "black") +
scale_fill_distiller(palette = "Spectral") +
theme(panel.border = element_blank()) +
geom_point(data = commute_mode_geo_bike,
aes(lon, lat,
color = percent),
inherit.aes = FALSE) +
scale_color_gradientn(colours = c("blue4", "yellow", "red")) +
labs(title='Biking to work in the USA', x='', y='',
fill = "state bike average",
color = "city bike percent") +
theme_void() +
theme(legend.position = "bottom")
US_bike
plotly::ggplotly(US_bike)
walk_map <- US_map_walk %>%
filter(long<0) %>%
ggplot(aes(long, lat,
group = group, fill = state_avg)) +
geom_polygon(color = "black") +
scale_fill_distiller(palette = "Spectral") +
theme(panel.border = element_blank()) +
geom_point(data = commute_mode_geo_walk,
aes(lon, lat,
color = percent),
inherit.aes = FALSE) +
scale_color_gradientn(colours = c("blue4", "yellow", "red")) +
labs(title='Walking to work in the USA', x='', y='',
fill = "state walk average",
color = "city walk percent") +
theme_void() +
theme(legend.position = "bottom")
walk_map
plotly::ggplotly(walk_map)